{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using TensorFlow backend.\n"
     ]
    }
   ],
   "source": [
    "'''https://www.jianshu.com/p/795a5e2cd10c\n",
    "http://alwa.info/2016/09/26/Keras-%E5%AE%9E%E7%8E%B0-LSTM/\n",
    "'''\n",
    "'''This script loads pre-trained word embeddings (GloVe embeddings)\n",
    "into a frozen Keras Embedding layer, and uses it to\n",
    "train a text classification model on the 20 Newsgroup dataset\n",
    "(classication of newsgroup messages into 20 different categories).\n",
    "GloVe embedding data can be found at:\n",
    "http://nlp.stanford.edu/data/glove.6B.zip\n",
    "(source page: http://nlp.stanford.edu/projects/glove/)\n",
    "20 Newsgroup data can be found at:\n",
    "http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.html\n",
    "'''\n",
    "# 多分类问题  序列\n",
    "from __future__ import print_function\n",
    "import os\n",
    "import numpy as np\n",
    "np.random.seed(1337)\n",
    "\n",
    "from keras.preprocessing.text import Tokenizer\n",
    "from keras.preprocessing.sequence import pad_sequences\n",
    "from keras.utils.np_utils import to_categorical\n",
    "from keras.layers import Dense, Input, Flatten\n",
    "from keras.models import Sequential\n",
    "from keras.layers import Dense, Dropout, Activation, Embedding\n",
    "from keras.layers import Conv1D, MaxPooling1D, Embedding\n",
    "from keras.models import Model\n",
    "from keras.layers import LSTM, SimpleRNN, GRU\n",
    "import sys"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Found 400000 word vectors.\n",
      "Processing text dataset\n",
      "Found 19997 texts.\n"
     ]
    }
   ],
   "source": [
    "BASE_DIR = 'D:/data analysis/data archives'\n",
    "GLOVE_DIR = BASE_DIR + '/glove.6B/'\n",
    "TEXT_DATA_DIR = BASE_DIR + '/20_newsgroup/'\n",
    "MAX_SEQUENCE_LENGTH = 1000\n",
    "MAX_NB_WORDS = 20000\n",
    "EMBEDDING_DIM = 100   # glove.6B ----->  array(1,100)\n",
    "VALIDATION_SPLIT = 0.2   # 验证\n",
    "batch_size = 32\n",
    "\n",
    "# first, build index mapping words in the embeddings set\n",
    "# to their embedding vector\n",
    "\n",
    "# embeddings_index = { \"word1\": \"vector array of (1, 100)\", \"word2\": \"vector array of (1, 100)\",  ...}   (20001,(1,100))\n",
    "embeddings_index = {}\n",
    "f = open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'), encoding='UTF-8')\n",
    "for line in f:\n",
    "    values = line.split()\n",
    "    word = values[0]\n",
    "    coefs = np.asarray(values[1:], dtype='float32')\n",
    "    embeddings_index[word] = coefs\n",
    "f.close()\n",
    "\n",
    "print('Found %s word vectors.' % len(embeddings_index))\n",
    "#Found 400000 word vectors.\n",
    "\n",
    "\n",
    "\n",
    "# second, prepare text samples and their labels\n",
    "print('Processing text dataset')\n",
    "\n",
    "texts = []  # list of text samples\n",
    "labels_index = {}  # dictionary mapping label name to numeric id   20 个dict\n",
    "labels = []  # list of label ids   20000个label  [0:1000]=0 [1000:2000]=1 ... （0-19)\n",
    "# iter 20 messages dir\n",
    "for name in sorted(os.listdir(TEXT_DATA_DIR)):\n",
    "    path = os.path.join(TEXT_DATA_DIR, name)\n",
    "    if os.path.isdir(path):\n",
    "        # 20 messages put tag 0-19\n",
    "        label_id = len(labels_index)\n",
    "        # 20 messages and its (index)label save to dict\n",
    "        labels_index[name] = label_id\n",
    "        for fname in sorted(os.listdir(path)):\n",
    "            if fname.isdigit():\n",
    "                fpath = os.path.join(path, fname)\n",
    "                # python 版本\n",
    "                if sys.version_info < (3,):\n",
    "                    f = open(fpath)\n",
    "                else:\n",
    "                    f = open(fpath, encoding='latin-1')\n",
    "                texts.append(f.read())\n",
    "                f.close()\n",
    "                labels.append(label_id)\n",
    "\n",
    "print('Found %s texts.' % len(texts))\n",
    "#Found 19997 texts."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Tokenizer()\n",
    "### somestr = ['ha ha gua angry',  'howa ha gua excited naive']\n",
    "### tok = tt.Tokenizer()\n",
    "### tok.fit_on_texts(somestr)\n",
    "\n",
    "### tok.word_index\n",
    "### Out[90]: {'angry': 3, 'excited': 5, 'gua': 2, 'ha': 1, 'howa': 4, 'naive': 6}\n",
    "\n",
    "### tok.texts_to_sequences(somestr)\n",
    "### Out[91]: [[1, 1, 2, 3], [4, 1, 2, 5, 6]]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "d:\\python\\lib\\site-packages\\keras_preprocessing\\text.py:177: UserWarning: The `nb_words` argument in `Tokenizer` has been renamed `num_words`.\n",
      "  warnings.warn('The `nb_words` argument in `Tokenizer` '\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "19997\n",
      "Found 214873 unique tokens.\n"
     ]
    }
   ],
   "source": [
    "# finally, vectorize the text samples into a 2D integer tensor\n",
    "# 生成的就是列数为MAX_NB_WORDS的matrix\n",
    "tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)\n",
    "tokenizer.fit_on_texts(texts)\n",
    "sequences = tokenizer.texts_to_sequences(texts)\n",
    "print(len(sequences)) # 19997\n",
    "# 生成不重复的  (unique) 索引(tokens) Out[90]: {'angry': 3, 'excited': 5, 'gua': 2, 'ha': 1, 'howa': 4, 'naive': 6}\n",
    "# 出现 21 万个单词\n",
    "word_index = tokenizer.word_index\n",
    "print('Found %s unique tokens.' % len(word_index))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### 上面的代码吧所有的单词都转换成了数字"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n",
      "1655\n"
     ]
    }
   ],
   "source": [
    "for i in range(100):\n",
    "    print(len(sequences[0]))# 1655"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Shape of data tensor: (19997, 1000)\n",
      "Shape of label tensor: (19997, 20)\n",
      "(15998, 1000)\n",
      "(15998, 20)\n",
      "3999\n",
      "Preparing embedding matrix.\n"
     ]
    }
   ],
   "source": [
    "# 取19997 篇 message 的前1000 个单词 \n",
    "# data 是一个长度为 1000 的 array，sequences 中不够长的部分被补0了\n",
    "data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)\n",
    "\n",
    "# labels 被转换成了 one-hot 编码的形式\n",
    "labels = to_categorical(np.asarray(labels))\n",
    "print('Shape of data tensor:', data.shape)\n",
    "print('Shape of label tensor:', labels.shape)\n",
    "\n",
    "# split the data into a training set and a validation set\n",
    "indices = np.arange(data.shape[0])\n",
    "np.random.shuffle(indices)\n",
    "data = data[indices]\n",
    "labels = labels[indices]\n",
    "nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])\n",
    "\n",
    "x_train = data[:-nb_validation_samples]\n",
    "y_train = labels[:-nb_validation_samples]\n",
    "x_val = data[-nb_validation_samples:]\n",
    "y_val = labels[-nb_validation_samples:]\n",
    "\n",
    "print(x_train.shape)\n",
    "print(y_train.shape)\n",
    "print(nb_validation_samples)\n",
    "\n",
    "print('Preparing embedding matrix.')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### embeddings_index  { \"word1\": \"vector array of (1, 100)\",...}       (400000)   (1,100)\n",
    "### word_index        {\"word\" : int_num }                              (214873)    1-214873\n",
    "### embedding_matrix  { int_num: \"vector array of (1, 100)\",...}       1-214873    (1,100)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(20001, 100)\n"
     ]
    }
   ],
   "source": [
    "nb_words = min(MAX_NB_WORDS, len(word_index))  # 20000  214873\n",
    "#20000\n",
    "embedding_matrix = np.zeros((nb_words + 1, EMBEDDING_DIM))  # 20001  100\n",
    "\n",
    "#  embeddings_index  a dict              {\"word1\": \"vector array of (1, 100)\",...}       (400000)   (1,100)\n",
    "#  word_index       a dict               {\"word\" : int_num }                              (214873)    1-214873\n",
    "#  embedding_matrix  matrix   not a dict { int_num: \"vector array of (1, 100)\",...}       (20001, 100)\n",
    "for word, i in word_index.items():\n",
    "    if i > MAX_NB_WORDS:\n",
    "        continue\n",
    "    # get word 对应的 vector  from  embeddings_index (dict)\n",
    "    embedding_vector = embeddings_index.get(word)\n",
    "    if embedding_vector is not None:\n",
    "        # words not found in embedding index will be all-zeros.\n",
    "        embedding_matrix[i] = embedding_vector\n",
    "\n",
    "print(embedding_matrix.shape)\n",
    "#(20001, 100)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "the 1\n",
      "edu 2\n",
      "to 3\n",
      "of 4\n"
     ]
    }
   ],
   "source": [
    "n = 0\n",
    "for k,v in word_index.items():\n",
    "    n+=1\n",
    "    if n<5:\n",
    "        print(k, v)\n",
    "    else:\n",
    "        break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2\n",
      "[ 0.052258    0.1337     -0.13601001 -0.53100002 -0.33232999  0.61752999\n",
      "  0.47343001 -0.29181001  0.55712003 -0.18652     0.50238001 -0.15098\n",
      " -0.68273002 -0.58508998  0.34746999  0.82177001  0.31852999  0.95371997\n",
      "  0.24924999  0.38438001 -0.73689002  0.50779998 -0.20254     0.79536998\n",
      " -0.26524001 -0.25088    -0.54742998  0.17678    -0.016369   -0.58517998\n",
      "  0.20823     0.7511     -0.6476      0.76545     0.35394999 -0.63657999\n",
      "  0.097478    0.55644    -0.48076001  0.50494999  0.67242002  0.33906001\n",
      " -0.66140997  0.85762    -0.0035989   0.43921    -0.72952998  0.31665999\n",
      " -0.1517     -0.13259999  0.30517     0.18661    -0.97727001 -0.46492001\n",
      " -0.55678999  0.11558    -0.29574001  0.35183999 -0.46292999  0.47373\n",
      "  0.52429998 -0.09943     0.34402001  0.46485999  0.23965999  0.92896998\n",
      "  0.30223    -0.14928     0.47215     0.21781     0.035236    0.047968\n",
      "  0.090056   -1.52830005 -0.08229    -0.41813999 -0.0087449   0.28643\n",
      "  0.037936   -0.44747001  0.38986    -0.48626    -0.11746     0.32335001\n",
      "  0.080105   -0.15143     0.35924     0.80868     0.22387999 -0.98461002\n",
      " -1.41620004  0.57876998 -0.75010002  1.08080006 -0.39853001 -0.33945999\n",
      "  1.08490002 -0.24304999 -0.71566999 -0.42782   ]\n",
      "[ 0.052258   0.1337    -0.13601   -0.531     -0.33233    0.61753\n",
      "  0.47343   -0.29181    0.55712   -0.18652    0.50238   -0.15098\n",
      " -0.68273   -0.58509    0.34747    0.82177    0.31853    0.95372\n",
      "  0.24925    0.38438   -0.73689    0.5078    -0.20254    0.79537\n",
      " -0.26524   -0.25088   -0.54743    0.17678   -0.016369  -0.58518\n",
      "  0.20823    0.7511    -0.6476     0.76545    0.35395   -0.63658\n",
      "  0.097478   0.55644   -0.48076    0.50495    0.67242    0.33906\n",
      " -0.66141    0.85762   -0.0035989  0.43921   -0.72953    0.31666\n",
      " -0.1517    -0.1326     0.30517    0.18661   -0.97727   -0.46492\n",
      " -0.55679    0.11558   -0.29574    0.35184   -0.46293    0.47373\n",
      "  0.5243    -0.09943    0.34402    0.46486    0.23966    0.92897\n",
      "  0.30223   -0.14928    0.47215    0.21781    0.035236   0.047968\n",
      "  0.090056  -1.5283    -0.08229   -0.41814   -0.0087449  0.28643\n",
      "  0.037936  -0.44747    0.38986   -0.48626   -0.11746    0.32335\n",
      "  0.080105  -0.15143    0.35924    0.80868    0.22388   -0.98461\n",
      " -1.4162     0.57877   -0.7501     1.0808    -0.39853   -0.33946\n",
      "  1.0849    -0.24305   -0.71567   -0.42782  ]\n",
      "[ True  True  True  True  True  True  True  True  True  True  True  True\n",
      "  True  True  True  True  True  True  True  True  True  True  True  True\n",
      "  True  True  True  True  True  True  True  True  True  True  True  True\n",
      "  True  True  True  True  True  True  True  True  True  True  True  True\n",
      "  True  True  True  True  True  True  True  True  True  True  True  True\n",
      "  True  True  True  True  True  True  True  True  True  True  True  True\n",
      "  True  True  True  True  True  True  True  True  True  True  True  True\n",
      "  True  True  True  True  True  True  True  True  True  True  True  True\n",
      "  True  True  True  True]\n"
     ]
    }
   ],
   "source": [
    "# 验证\n",
    "num = word_index[\"edu\"]\n",
    "print(num)\n",
    "print(embedding_matrix[num])\n",
    "print(embeddings_index[\"edu\"])\n",
    "print(embedding_matrix[num]==embeddings_index[\"edu\"])\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2.6 LSTM训练"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "d:\\python\\lib\\site-packages\\ipykernel_launcher.py:10: UserWarning: The `dropout` argument is no longer support in `Embedding`. You can apply a `keras.layers.SpatialDropout1D` layer right after the `Embedding` layer to get the same behavior.\n",
      "  # Remove the CWD from sys.path while we load stuff.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Training model.\n",
      "(15998, 1000)\n"
     ]
    }
   ],
   "source": [
    "# data的shape （19997,100）\n",
    "\n",
    "# load pre-trained word embeddings into an Embedding layer\n",
    "# note that we set trainable = False so as to keep the embeddings fixed\n",
    "embedding_layer = Embedding(nb_words + 1,\n",
    "                            EMBEDDING_DIM,\n",
    "                            weights=[embedding_matrix],\n",
    "                            input_length=MAX_SEQUENCE_LENGTH,\n",
    "                            trainable=False,\n",
    "                            dropout=0.2)\n",
    "\n",
    "print('Training model.')\n",
    "\n",
    "# train a 1D convnet with global maxpooling\n",
    "sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')\n",
    "embedded_sequences = embedding_layer(sequence_input)\n",
    "x = Conv1D(128, 5, activation='relu')(embedded_sequences)\n",
    "x = MaxPooling1D(5)(x)\n",
    "x = Conv1D(128, 5, activation='relu')(x)\n",
    "x = MaxPooling1D(5)(x)\n",
    "x = Conv1D(128, 5, activation='relu')(x)\n",
    "x = MaxPooling1D(35)(x)\n",
    "x = Flatten()(x)\n",
    "x = Dense(128, activation='relu')(x)\n",
    "\n",
    "preds = Dense(len(labels_index), activation='softmax')(x)\n",
    "\n",
    "model = Model(sequence_input, preds)\n",
    "model.compile(loss='categorical_crossentropy',\n",
    "              optimizer='rmsprop',\n",
    "              metrics=['acc'])\n",
    "print(x_train.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "20000 1000\n",
      "(15998, 1000) (15998, 20) (3999, 1000) (3999, 20)\n",
      "[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n"
     ]
    }
   ],
   "source": [
    "print(nb_words, MAX_SEQUENCE_LENGTH)\n",
    "print(x_train.shape, y_train.shape, x_val.shape, y_val.shape)\n",
    "print(y_train[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "d:\\python\\lib\\site-packages\\ipykernel_launcher.py:6: UserWarning: The `dropout` argument is no longer support in `Embedding`. You can apply a `keras.layers.SpatialDropout1D` layer right after the `Embedding` layer to get the same behavior.\n",
      "  \n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Build model...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "d:\\python\\lib\\site-packages\\ipykernel_launcher.py:14: UserWarning: Update your `LSTM` call to the Keras 2 API: `LSTM(100, dropout=0.2, recurrent_dropout=0.2)`\n",
      "  \n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "d:\\python\\lib\\site-packages\\ipykernel_launcher.py:26: UserWarning: The `nb_epoch` argument in `fit` has been renamed `epochs`.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train on 15998 samples, validate on 3999 samples\n",
      "Epoch 1/5\n",
      "15998/15998 [==============================] - 474s 30ms/step - loss: 0.1981 - acc: 0.9500 - val_loss: 0.1971 - val_acc: 0.9500\n",
      "Epoch 2/5\n",
      "15998/15998 [==============================] - 480s 30ms/step - loss: 0.1970 - acc: 0.9500 - val_loss: 0.1986 - val_acc: 0.9500\n",
      "Epoch 3/5\n",
      "15998/15998 [==============================] - 471s 29ms/step - loss: 0.1984 - acc: 0.9500 - val_loss: 0.1941 - val_acc: 0.9500\n",
      "Epoch 4/5\n",
      "15998/15998 [==============================] - 462s 29ms/step - loss: 0.1981 - acc: 0.9500 - val_loss: 0.1984 - val_acc: 0.9500\n",
      "Epoch 5/5\n",
      "15998/15998 [==============================] - 472s 30ms/step - loss: 0.1919 - acc: 0.9500 - val_loss: 0.1881 - val_acc: 0.9500\n",
      "3999/3999 [==============================] - 27s 7ms/step\n",
      "Test score: 0.18813910955531982\n",
      "Test accuracy: 0.949999988079071\n"
     ]
    }
   ],
   "source": [
    "embedding_layer = Embedding(nb_words + 1,\n",
    "                            EMBEDDING_DIM,\n",
    "                            weights=[embedding_matrix],\n",
    "                            input_length=MAX_SEQUENCE_LENGTH,\n",
    "                            trainable=False,\n",
    "                            dropout=0.2)\n",
    "batch_size = 32\n",
    "\n",
    "print('Build model...')\n",
    "# sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')\n",
    "# embedded_sequences = embedding_layer()\n",
    "model = Sequential()\n",
    "model.add(embedding_layer)\n",
    "model.add(LSTM(100, dropout_W=0.2, dropout_U=0.2))  # try using a GRU instead, for fun\n",
    "model.add(Dense(1))\n",
    "model.add(Activation('sigmoid'))\n",
    "model.add(Dense(len(labels_index), activation='softmax'))\n",
    "\n",
    "# try using different optimizers and different optimizer configs\n",
    "model.compile(loss='binary_crossentropy',\n",
    "              optimizer='adam',\n",
    "              metrics=['accuracy'])\n",
    "\n",
    "print('Train...')\n",
    "model.fit(x_train, y_train, batch_size=batch_size, nb_epoch=5,\n",
    "          validation_data=(x_val, y_val))\n",
    "score, acc = model.evaluate(x_val, y_val,\n",
    "                            batch_size=batch_size)\n",
    "print('Test score:', score)\n",
    "print('Test accuracy:', acc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "_________________________________________________________________\n",
      "Layer (type)                 Output Shape              Param #   \n",
      "=================================================================\n",
      "embedding_4 (Embedding)      (None, 1000, 100)         2000100   \n",
      "_________________________________________________________________\n",
      "lstm_2 (LSTM)                (None, 100)               80400     \n",
      "_________________________________________________________________\n",
      "dense_5 (Dense)              (None, 1)                 101       \n",
      "_________________________________________________________________\n",
      "activation_2 (Activation)    (None, 1)                 0         \n",
      "_________________________________________________________________\n",
      "dense_6 (Dense)              (None, 20)                40        \n",
      "=================================================================\n",
      "Total params: 2,080,641\n",
      "Trainable params: 80,541\n",
      "Non-trainable params: 2,000,100\n",
      "_________________________________________________________________\n"
     ]
    }
   ],
   "source": [
    "model.summary()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "d:\\python\\lib\\site-packages\\ipykernel_launcher.py:3: UserWarning: The `nb_epoch` argument in `fit` has been renamed `epochs`.\n",
      "  This is separate from the ipykernel package so we can avoid doing imports until\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train on 15998 samples, validate on 3999 samples\n",
      "Epoch 1/2\n",
      "15998/15998 [==============================] - 681s 43ms/step - loss: 0.1880 - acc: 0.9500 - val_loss: 0.1871 - val_acc: 0.9500\n",
      "Epoch 2/2\n",
      "15998/15998 [==============================] - 807s 50ms/step - loss: 0.1867 - acc: 0.9500 - val_loss: 0.1859 - val_acc: 0.9500\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<keras.callbacks.History at 0x25d716a0>"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# happy learning!\n",
    "model.fit(x_train, y_train, validation_data=(x_val, y_val),\n",
    "          nb_epoch=2, batch_size=128)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
